{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pandas-Dask基本数据处理速度比较"
   ]
  },
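  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "内容介绍: 分别用 pandas 和 Dask 读取同一个 CSV 文件 (GB18030 编码), 并用 `%%time` 比较两者的耗时。\n",
    "\n",
    "注意: `dd.read_csv` 是惰性求值的, `ddf.head(3)` 只需解析第一个分区, 因此 Dask 单元的耗时并不代表读取整个文件所需的时间。"
   ]
  },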
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import dask.dataframe as dd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13.8 s, sys: 1.02 s, total: 14.8 s\n",
      "Wall time: 14.9 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>JE_HEADER_ID</th>\n",
       "      <th>机构代码</th>\n",
       "      <th>机构说明</th>\n",
       "      <th>科目代码</th>\n",
       "      <th>科目说明</th>\n",
       "      <th>产品代码</th>\n",
       "      <th>产品说明</th>\n",
       "      <th>渠道代码</th>\n",
       "      <th>凭证来源</th>\n",
       "      <th>凭证编号</th>\n",
       "      <th>凭证名</th>\n",
       "      <th>记账日期</th>\n",
       "      <th>生效日期</th>\n",
       "      <th>行号</th>\n",
       "      <th>借项金额</th>\n",
       "      <th>贷项金额</th>\n",
       "      <th>摘要</th>\n",
       "      <th>期间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8034420</td>\n",
       "      <td>440400</td>\n",
       "      <td>珠海中心支公司</td>\n",
       "      <td>11220400</td>\n",
       "      <td>应收保费-银保通</td>\n",
       "      <td>311404</td>\n",
       "      <td>华夏富贵竹年金保险</td>\n",
       "      <td>3101</td>\n",
       "      <td>个险业务系统</td>\n",
       "      <td>业务207</td>\n",
       "      <td>440400BIZ201701041059 31- 预收保费 CNY</td>\n",
       "      <td>2017/1/4 13:55:44</td>\n",
       "      <td>2016/12/31</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>20161231 预收保费 201701041059</td>\n",
       "      <td>12-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8034361</td>\n",
       "      <td>120100</td>\n",
       "      <td>天津分公司</td>\n",
       "      <td>11220400</td>\n",
       "      <td>应收保费-银保通</td>\n",
       "      <td>311408</td>\n",
       "      <td>华夏财富宝养老年金保险（C款）</td>\n",
       "      <td>3101</td>\n",
       "      <td>个险业务系统</td>\n",
       "      <td>业务354</td>\n",
       "      <td>120100BIZ201701041056 31- 预收保费 CNY</td>\n",
       "      <td>2017/1/4 13:55:44</td>\n",
       "      <td>2016/12/31</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>20161231 预收保费 201701041056</td>\n",
       "      <td>12-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8032760</td>\n",
       "      <td>610500</td>\n",
       "      <td>渭南中心支公司</td>\n",
       "      <td>44110100</td>\n",
       "      <td>退保金</td>\n",
       "      <td>111506</td>\n",
       "      <td>华夏常青树重大疾病保险（2016）</td>\n",
       "      <td>1101</td>\n",
       "      <td>个险业务系统</td>\n",
       "      <td>业务427</td>\n",
       "      <td>610500BIZ201701010059 31- 应付 CNY</td>\n",
       "      <td>2017/1/4 10:35:39</td>\n",
       "      <td>2016/12/31</td>\n",
       "      <td>16</td>\n",
       "      <td>2801.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20161231 应付 201701010059</td>\n",
       "      <td>12-2016</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   JE_HEADER_ID    机构代码     机构说明      科目代码      科目说明    产品代码  \\\n",
       "0       8034420  440400  珠海中心支公司  11220400  应收保费-银保通  311404   \n",
       "1       8034361  120100    天津分公司  11220400  应收保费-银保通  311408   \n",
       "2       8032760  610500  渭南中心支公司  44110100       退保金  111506   \n",
       "\n",
       "                产品说明  渠道代码    凭证来源   凭证编号                                 凭证名  \\\n",
       "0          华夏富贵竹年金保险  3101  个险业务系统  业务207  440400BIZ201701041059 31- 预收保费 CNY   \n",
       "1    华夏财富宝养老年金保险（C款）  3101  个险业务系统  业务354  120100BIZ201701041056 31- 预收保费 CNY   \n",
       "2  华夏常青树重大疾病保险（2016）  1101  个险业务系统  业务427    610500BIZ201701010059 31- 应付 CNY   \n",
       "\n",
       "                记账日期        生效日期  行号    借项金额     贷项金额  \\\n",
       "0  2017/1/4 13:55:44  2016/12/31   1     NaN  10000.0   \n",
       "1  2017/1/4 13:55:44  2016/12/31   2     NaN  50000.0   \n",
       "2  2017/1/4 10:35:39  2016/12/31  16  2801.0      NaN   \n",
       "\n",
       "                           摘要       期间  \n",
       "0  20161231 预收保费 201701041059  12-2016  \n",
       "1  20161231 预收保费 201701041056  12-2016  \n",
       "2    20161231 应付 201701010059  12-2016  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# pandas获取数据\n",
    "df = pd.read_csv('/home/ubuntu/Documents/12-2016_yw.csv',encoding='GB18030')#,encoding='utf-8'\n",
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.28 s, sys: 335 ms, total: 3.62 s\n",
      "Wall time: 3.62 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>JE_HEADER_ID</th>\n",
       "      <th>机构代码</th>\n",
       "      <th>机构说明</th>\n",
       "      <th>科目代码</th>\n",
       "      <th>科目说明</th>\n",
       "      <th>产品代码</th>\n",
       "      <th>产品说明</th>\n",
       "      <th>渠道代码</th>\n",
       "      <th>凭证来源</th>\n",
       "      <th>凭证编号</th>\n",
       "      <th>凭证名</th>\n",
       "      <th>记账日期</th>\n",
       "      <th>生效日期</th>\n",
       "      <th>行号</th>\n",
       "      <th>借项金额</th>\n",
       "      <th>贷项金额</th>\n",
       "      <th>摘要</th>\n",
       "      <th>期间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8034420</td>\n",
       "      <td>440400</td>\n",
       "      <td>珠海中心支公司</td>\n",
       "      <td>11220400</td>\n",
       "      <td>应收保费-银保通</td>\n",
       "      <td>311404</td>\n",
       "      <td>华夏富贵竹年金保险</td>\n",
       "      <td>3101</td>\n",
       "      <td>个险业务系统</td>\n",
       "      <td>业务207</td>\n",
       "      <td>440400BIZ201701041059 31- 预收保费 CNY</td>\n",
       "      <td>2017/1/4 13:55:44</td>\n",
       "      <td>2016/12/31</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>20161231 预收保费 201701041059</td>\n",
       "      <td>12-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8034361</td>\n",
       "      <td>120100</td>\n",
       "      <td>天津分公司</td>\n",
       "      <td>11220400</td>\n",
       "      <td>应收保费-银保通</td>\n",
       "      <td>311408</td>\n",
       "      <td>华夏财富宝养老年金保险（C款）</td>\n",
       "      <td>3101</td>\n",
       "      <td>个险业务系统</td>\n",
       "      <td>业务354</td>\n",
       "      <td>120100BIZ201701041056 31- 预收保费 CNY</td>\n",
       "      <td>2017/1/4 13:55:44</td>\n",
       "      <td>2016/12/31</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>20161231 预收保费 201701041056</td>\n",
       "      <td>12-2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8032760</td>\n",
       "      <td>610500</td>\n",
       "      <td>渭南中心支公司</td>\n",
       "      <td>44110100</td>\n",
       "      <td>退保金</td>\n",
       "      <td>111506</td>\n",
       "      <td>华夏常青树重大疾病保险（2016）</td>\n",
       "      <td>1101</td>\n",
       "      <td>个险业务系统</td>\n",
       "      <td>业务427</td>\n",
       "      <td>610500BIZ201701010059 31- 应付 CNY</td>\n",
       "      <td>2017/1/4 10:35:39</td>\n",
       "      <td>2016/12/31</td>\n",
       "      <td>16</td>\n",
       "      <td>2801.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20161231 应付 201701010059</td>\n",
       "      <td>12-2016</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   JE_HEADER_ID    机构代码     机构说明      科目代码      科目说明    产品代码  \\\n",
       "0       8034420  440400  珠海中心支公司  11220400  应收保费-银保通  311404   \n",
       "1       8034361  120100    天津分公司  11220400  应收保费-银保通  311408   \n",
       "2       8032760  610500  渭南中心支公司  44110100       退保金  111506   \n",
       "\n",
       "                产品说明  渠道代码    凭证来源   凭证编号                                 凭证名  \\\n",
       "0          华夏富贵竹年金保险  3101  个险业务系统  业务207  440400BIZ201701041059 31- 预收保费 CNY   \n",
       "1    华夏财富宝养老年金保险（C款）  3101  个险业务系统  业务354  120100BIZ201701041056 31- 预收保费 CNY   \n",
       "2  华夏常青树重大疾病保险（2016）  1101  个险业务系统  业务427    610500BIZ201701010059 31- 应付 CNY   \n",
       "\n",
       "                记账日期        生效日期  行号    借项金额     贷项金额  \\\n",
       "0  2017/1/4 13:55:44  2016/12/31   1     NaN  10000.0   \n",
       "1  2017/1/4 13:55:44  2016/12/31   2     NaN  50000.0   \n",
       "2  2017/1/4 10:35:39  2016/12/31  16  2801.0      NaN   \n",
       "\n",
       "                           摘要       期间  \n",
       "0  20161231 预收保费 201701041059  12-2016  \n",
       "1  20161231 预收保费 201701041056  12-2016  \n",
       "2    20161231 应付 201701010059  12-2016  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "#使用Dask模块读取csv数据\n",
    "ddf = dd.read_csv('/home/ubuntu/Documents/12-2016_yw.csv',encoding='GB18030')#,encoding='utf-8'\n",
    "ddf.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
